Package org.terrier.structures.indexing

Source Code of org.terrier.structures.indexing.BlockDocumentPostingList$blockPostings

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.uk
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is BlockDocumentPostingList.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
*  
*/
package org.terrier.structures.indexing;
import gnu.trove.THashMap;
import gnu.trove.TIntHashSet;
import gnu.trove.TIntObjectHashMap;
import gnu.trove.TObjectIntProcedure;

import java.util.Arrays;

import org.terrier.sorting.HeapSortInt;
import org.terrier.structures.postings.BlockPosting;
import org.terrier.structures.postings.BlockPostingImpl;
import org.terrier.structures.postings.IterablePosting;
import org.terrier.structures.postings.WritablePosting;
/** Represents the postings of one document, and saves block (term position) information. Uses HashMaps internally.
  * <p>
  * <b>Properties:</b><br>
  * <ul><li><tt>indexing.avg.unique.terms.per.doc</tt> - number of unique terms per doc on average, used to tune the initial
  * size of the haashmaps used in this class.</li></ul>
  * @see org.terrier.structures.indexing.DocumentPostingList
  */
public class BlockDocumentPostingList extends DocumentPostingList
{
  /** mapping term to blockids in this document */
  protected final THashMap<String, TIntHashSet> term_blocks = new THashMap<String, TIntHashSet>(AVG_DOCUMENT_UNIQUE_TERMS);
  /** number of blocks in this document. usually equal to document length, but perhaps less */
  protected int blockCount = 0;
  /** Instantiate a new block document posting list. Saves block information, but no fields */
  public BlockDocumentPostingList() {super();}
 
  /** Insert a term into this document, occurs at given block id */
  public void insert(String t, int blockId)
  {
    insert(t);
    TIntHashSet blockids = null;
    if ((blockids = term_blocks.get(t)) == null)
    {
      term_blocks.put(t, blockids = new TIntHashSet(/*TODO */));
    }
    blockids.add(blockId);
    blockCount++; 
  }

 
  /**
   * return blocks
   * @param term
   * @return int[]
   */
  public int[] getBlocks(String term)
  {
    int[] rtr = term_blocks.get(term).toArray();
    if (rtr ==  null)
      return new int[0];
    Arrays.sort(rtr);
    return rtr;
  }

  /** returns the postings suitable to be written into the block direct index */
  @Override
  public int[][] getPostings()
  {
    final int termCount = occurrences.size();
    final int[] termids = new int[termCount];
    final int[] tfs = new int[termCount];
    final int[] fields = null;
    final int[] blockfreqs = new int[termCount];
    final TIntObjectHashMap<int[]> term2blockids = new TIntObjectHashMap<int[]>();
    int blockTotal = 0; //TODO we already have blockTotal as this.blockCount, so no need to count?
    class PostingVisitor implements TObjectIntProcedure<String> {
      int i=0;
      int blockTotal = 0;
      public boolean execute(final String a, final int b)
      {
        termids[i] = getTermId(a);
        tfs[i] = b;
        final TIntHashSet ids = term_blocks.get(a);
        blockfreqs[i] = ids.size();
        blockTotal += ids.size();
        final int[] bids = ids.toArray();
        Arrays.sort(bids);
        term2blockids.put(termids[i], bids);
        //System.err.println(a+": tid="+termids[i]+" tf="+tfs[i]+" bf="+blockfreqs[i] +" blocks="+Arrays.toString(bids));
        i++;
        return true;
      }
    }
    PostingVisitor proc = new PostingVisitor();
    occurrences.forEachEntry(proc);
    blockTotal = proc.blockTotal;
    HeapSortInt.ascendingHeapSort(termids, tfs,  blockfreqs);
    final int[] blockids = new int[blockTotal];
    int offset = 0;
    for (int termid : termids)
    {
      final int[] src = term2blockids.get(termid);
      final int src_l = src.length;
      System.arraycopy(src, 0, blockids, offset, src_l);
      offset+= src_l;
    }
    return new int[][]{termids, tfs, fields, blockfreqs, blockids};
  }
 
  protected IterablePosting makePostingIterator(String[] _terms, int[] termIds)
  {
    return new blockPostings(_terms, termIds);
  }
 
  class blockPostings extends postingIterator implements BlockPosting
  {
    public blockPostings(String[] _terms, int[] ids) {
      super(_terms, ids);
    }   
   
    /** {@inheritDoc} */
    public int[] getPositions() {
      int[] blockIds = term_blocks.get(terms[i]).toArray();
      Arrays.sort(blockIds);
      return blockIds;
    }

    @Override
    public WritablePosting asWritablePosting() {
      BlockPostingImpl fbp = new BlockPostingImpl(termIds[i], getFrequency(), getPositions());
      return fbp;
    }   
  }
}
TOP

Related Classes of org.terrier.structures.indexing.BlockDocumentPostingList$blockPostings

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.